{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"https://deecamp.chuangxin.com/assets/image/logo_nav_zh.jpg\" width=\"40%\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 此文代码为主线代码，主要负责提取原始数据和转换为可用数据\n",
    "# 不要修改此文件，复制此文件到自己文件夹后再操作"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.spark 库和环境设定"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import plotly\n",
    "os.environ['JAVA_HOME'] = '/usr/local/jdk1.8.0_162'#必须保留，否则无法启动JVM\n",
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder.master(\"local[16]\").appName(\"spark-demo\").getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.读取总数据和标签数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**raw_data为通过spark读取的原始数据**\n",
    "\n",
    "格式为pyspark.sql.dataframe.DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3 ms, sys: 0 ns, total: 3 ms\n",
      "Wall time: 2.65 s\n"
     ]
    }
   ],
   "source": [
    "%time raw_data=spark.read.parquet('/data/final_data/data/')#1G内存"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**label_raw_data为通过读取的带有标签的原始数据**\n",
    "\n",
    "格式为pyspark.sql.dataframe.DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.93 ms, sys: 821 µs, total: 3.75 ms\n",
      "Wall time: 3.11 s\n"
     ]
    }
   ],
   "source": [
    "%time label_raw_data=spark.read.parquet('/data/final_data/label/user_name_label_give')#1G内存"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.将两个原始数据转成对应的pandas数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**raw_pd_data为转成pandas数据格式的总数据**\n",
    "\n",
    "**label_pd_raw_data为转成pands数据格式的标签数据**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 43.4 s, sys: 3.54 s, total: 46.9 s\n",
      "Wall time: 1min 9s\n"
     ]
    }
   ],
   "source": [
    "%time raw_pd_data=raw_data.toPandas()#运行时间非常长,读一次需要较大内存13G   需要用的时候再开启"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 30.9 ms, sys: 3.78 ms, total: 34.7 ms\n",
      "Wall time: 1.11 s\n"
     ]
    }
   ],
   "source": [
    "%time label_pd_raw_data=label_raw_data.toPandas()#不到1G"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 清理内存占用（保留）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#raw_data=None\n",
    "#label_raw_data=None#释放1G内存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#raw_pd_data=None#释放4G内存\n",
    "#label_pd_raw_data=None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_pd_raw_data.to_csv('/data/csv/label.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.构建标签用户事件数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1获取list[包含所有标签用户user_name]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**tmp_label_user_list 为临时变量**\n",
    "\n",
    "**label_user_list 为 包含所有标签用户名的list**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp_label_user_list=label_raw_data.select('user_name').collect()\n",
    "label_user_list=[int(row.user_name) for row in tmp_label_user_list]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.2 获取user_label 字典"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**dict_user_label为提取的标签字典 user:label**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict_user_label=label_pd_raw_data.set_index('user_name')['label'].to_dict()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.3 获取有标签用户的事件"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**label_events为所有标签用户事件（带标签）**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.53 s, sys: 1.31 s, total: 3.83 s\n",
      "Wall time: 23.4 s\n"
     ]
    }
   ],
   "source": [
    "%time label_event_tmp1=raw_data.filter(raw_data['user_name'].isin(label_user_list)) #使用df.filter 筛选 有标签用户在总数据集中的事件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_event_tmp2=label_event_tmp1.toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_event_tmp3=label_event_tmp2.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_event_tmp3['label'] = label_event_tmp3['user_name'].map(dict_user_label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp_good_bad_dict={'good':0,'bad':1}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_event_tmp4=label_event_tmp3.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_event_tmp4['label']=label_event_tmp3['label'].map(tmp_good_bad_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.4 IP分段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_event_tmp4.replace(to_replace=r'^\\s*$',value=np.nan,regex=True,inplace=True)   #将空字符替换为nan\n",
    "\n",
    "\n",
    "label_event_tmp5=label_event_tmp4['ip'].str.split('.', expand=True,n = 3)\n",
    "label_event_tmp5.columns=['ip_1','ip_2','ip_3','ip_4']\n",
    "label_event_tmp6=pd.concat([label_event_tmp5,label_event_tmp4], axis=1) #数据合并"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ip_1</th>\n",
       "      <th>ip_2</th>\n",
       "      <th>ip_3</th>\n",
       "      <th>ip_4</th>\n",
       "      <th>ip</th>\n",
       "      <th>ip_city</th>\n",
       "      <th>email_prefix</th>\n",
       "      <th>email_provider</th>\n",
       "      <th>event_type</th>\n",
       "      <th>mobile_prefix_3</th>\n",
       "      <th>...</th>\n",
       "      <th>user_name</th>\n",
       "      <th>user_agent</th>\n",
       "      <th>os_version</th>\n",
       "      <th>resource_owner</th>\n",
       "      <th>register_type</th>\n",
       "      <th>category</th>\n",
       "      <th>status</th>\n",
       "      <th>resource_type</th>\n",
       "      <th>resource_category</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1348648</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>183663</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1069</td>\n",
       "      <td>126</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1238148</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>353041</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>20487</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>260127</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>401</td>\n",
       "      <td>85</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1097570</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>563412</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>72</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>479461</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1224337</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1348648</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1625352</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>102</td>\n",
       "      <td>86</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1097570</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1672519</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1348648</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1630966</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1069</td>\n",
       "      <td>137</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1097570</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>391718</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1321969</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>557703</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>102</td>\n",
       "      <td>110</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1238148</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1635240</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1110</td>\n",
       "      <td>91</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1321969</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>245612</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>110</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1504169</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>629467</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1383</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1348648</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1172557</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>528</td>\n",
       "      <td>33</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1522376</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>463322</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1504169</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>484628</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>125</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1556252</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1097978</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>136</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>101986</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>288741</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1399609</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>577284</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1069</td>\n",
       "      <td>98</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1238148</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>531910</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>661</td>\n",
       "      <td>91</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1504169</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>251403</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>661</td>\n",
       "      <td>91</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>324364</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>532559</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>823</td>\n",
       "      <td>35</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1399609</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>558233</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>91</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1589753</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>651200</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>33</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1348648</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>500204</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>33</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1550975</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>528451</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>384</td>\n",
       "      <td>98</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1321969</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1246807</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1647505</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>231095</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1069</td>\n",
       "      <td>98</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1348648</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>603231</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>83</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>132340</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>300058</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70297</th>\n",
       "      <td>221</td>\n",
       "      <td>218</td>\n",
       "      <td>195</td>\n",
       "      <td>127</td>\n",
       "      <td>221.218.195.127</td>\n",
       "      <td>北京</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1597163</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70298</th>\n",
       "      <td>123</td>\n",
       "      <td>115</td>\n",
       "      <td>51</td>\n",
       "      <td>14</td>\n",
       "      <td>123.115.51.14</td>\n",
       "      <td>北京</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1597163</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70299</th>\n",
       "      <td>123</td>\n",
       "      <td>115</td>\n",
       "      <td>51</td>\n",
       "      <td>14</td>\n",
       "      <td>123.115.51.14</td>\n",
       "      <td>北京</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1597163</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70300</th>\n",
       "      <td>117</td>\n",
       "      <td>136</td>\n",
       "      <td>76</td>\n",
       "      <td>27</td>\n",
       "      <td>117.136.76.27</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1601204</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70301</th>\n",
       "      <td>223</td>\n",
       "      <td>104</td>\n",
       "      <td>22</td>\n",
       "      <td>167</td>\n",
       "      <td>223.104.22.167</td>\n",
       "      <td>南宁</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1601204</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70302</th>\n",
       "      <td>171</td>\n",
       "      <td>36</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>171.36.0.66</td>\n",
       "      <td>南宁</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1601204</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70303</th>\n",
       "      <td>117</td>\n",
       "      <td>136</td>\n",
       "      <td>76</td>\n",
       "      <td>27</td>\n",
       "      <td>117.136.76.27</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1601204</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70304</th>\n",
       "      <td>171</td>\n",
       "      <td>36</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>171.36.0.66</td>\n",
       "      <td>南宁</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1601204</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70305</th>\n",
       "      <td>171</td>\n",
       "      <td>36</td>\n",
       "      <td>0</td>\n",
       "      <td>66</td>\n",
       "      <td>171.36.0.66</td>\n",
       "      <td>南宁</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1601204</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70306</th>\n",
       "      <td>223</td>\n",
       "      <td>104</td>\n",
       "      <td>22</td>\n",
       "      <td>167</td>\n",
       "      <td>223.104.22.167</td>\n",
       "      <td>南宁</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1601204</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70307</th>\n",
       "      <td>113</td>\n",
       "      <td>222</td>\n",
       "      <td>202</td>\n",
       "      <td>222</td>\n",
       "      <td>113.222.202.222</td>\n",
       "      <td>长沙</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1609489</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70308</th>\n",
       "      <td>218</td>\n",
       "      <td>18</td>\n",
       "      <td>232</td>\n",
       "      <td>228</td>\n",
       "      <td>218.18.232.228</td>\n",
       "      <td>深圳</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1622550</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70309</th>\n",
       "      <td>112</td>\n",
       "      <td>28</td>\n",
       "      <td>151</td>\n",
       "      <td>206</td>\n",
       "      <td>112.28.151.206</td>\n",
       "      <td>合肥</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1624250</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70310</th>\n",
       "      <td>106</td>\n",
       "      <td>38</td>\n",
       "      <td>52</td>\n",
       "      <td>170</td>\n",
       "      <td>106.38.52.170</td>\n",
       "      <td>北京</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1632623</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70311</th>\n",
       "      <td>183</td>\n",
       "      <td>12</td>\n",
       "      <td>245</td>\n",
       "      <td>21</td>\n",
       "      <td>183.12.245.21</td>\n",
       "      <td>深圳</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1658629</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70312</th>\n",
       "      <td>218</td>\n",
       "      <td>17</td>\n",
       "      <td>250</td>\n",
       "      <td>112</td>\n",
       "      <td>218.17.250.112</td>\n",
       "      <td>深圳</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1658629</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70313</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>162197</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>259328</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>85</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70314</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1597163</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1124355</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1383</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70315</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>248160</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>505050</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70316</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1609489</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>196591</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>102</td>\n",
       "      <td>117</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70317</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>222274</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>308481</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70318</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1517761</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>390436</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>44</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70319</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1529267</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>363823</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70320</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1601204</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1101473</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>91</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70321</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1597163</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1557283</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70322</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1529267</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>77690</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1167</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70323</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1529267</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>500186</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70324</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1531404</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>288741</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1069</td>\n",
       "      <td>58</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70325</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1531404</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>613863</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1515</td>\n",
       "      <td>58</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70326</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1292930</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1567706</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>410</td>\n",
       "      <td>98</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>70327 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      ip_1 ip_2 ip_3 ip_4   ip ip_city email_prefix email_provider event_type  \\\n",
       "0      NaN  NaN  NaN  NaN  NaN     NaN         None           None          1   \n",
       "1      NaN  NaN  NaN  NaN  NaN     NaN         None           None          1   \n",
       "2      NaN  NaN  NaN  NaN  NaN     NaN         None           None          1   \n",
       "3      NaN  NaN  NaN  NaN  NaN     NaN         None           None          1   \n",
       "4      NaN  NaN  NaN  NaN  NaN     NaN         None           None          1   \n",
       "...    ...  ...  ...  ...  ...     ...          ...            ...        ...   \n",
       "70322  NaN  NaN  NaN  NaN  NaN     NaN         None           None          1   \n",
       "70323  NaN  NaN  NaN  NaN  NaN     NaN         None           None          1   \n",
       "70324  NaN  NaN  NaN  NaN  NaN     NaN         None           None          1   \n",
       "70325  NaN  NaN  NaN  NaN  NaN     NaN         None           None          1   \n",
       "70326  NaN  NaN  NaN  NaN  NaN     NaN         None           None          1   \n",
       "\n",
       "      mobile_prefix_3  ... user_name user_agent os_version resource_owner  \\\n",
       "0                 NaN  ...   1348648          0          0         183663   \n",
       "1                 NaN  ...   1238148          0          0         353041   \n",
       "2                 NaN  ...     20487          0          0         260127   \n",
       "3                 NaN  ...   1097570          0          0         563412   \n",
       "4                 NaN  ...    479461          0          0        1224337   \n",
       "...               ...  ...       ...        ...        ...            ...   \n",
       "70322             NaN  ...   1529267          0          0          77690   \n",
       "70323             NaN  ...   1529267          0          0         500186   \n",
       "70324             NaN  ...   1531404          0          0         288741   \n",
       "70325             NaN  ...   1531404          0          0         613863   \n",
       "70326             NaN  ...   1292930          0          0        1567706   \n",
       "\n",
       "      register_type category status resource_type resource_category label  \n",
       "0              None     None   None          1069               126     0  \n",
       "1              None     None   None          1515                 6     0  \n",
       "2              None     None   None           401                85     1  \n",
       "3              None     None   None          1167                72     1  \n",
       "4              None     None   None          1515                11     1  \n",
       "...             ...      ...    ...           ...               ...   ...  \n",
       "70322          None     None   None          1167                12     1  \n",
       "70323          None     None   None          1515                11     1  \n",
       "70324          None     None   None          1069                58     0  \n",
       "70325          None     None   None          1515                58     0  \n",
       "70326          None     None   None           410                98     0  \n",
       "\n",
       "[70327 rows x 22 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label_event_tmp6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ip_1</th>\n",
       "      <th>ip_2</th>\n",
       "      <th>ip_3</th>\n",
       "      <th>ip_4</th>\n",
       "      <th>ip</th>\n",
       "      <th>ip_city</th>\n",
       "      <th>email_prefix</th>\n",
       "      <th>email_provider</th>\n",
       "      <th>event_type</th>\n",
       "      <th>mobile_prefix_3</th>\n",
       "      <th>...</th>\n",
       "      <th>time_stamp</th>\n",
       "      <th>user_name</th>\n",
       "      <th>user_agent</th>\n",
       "      <th>os_version</th>\n",
       "      <th>resource_owner</th>\n",
       "      <th>register_type</th>\n",
       "      <th>category</th>\n",
       "      <th>status</th>\n",
       "      <th>resource_type</th>\n",
       "      <th>resource_category</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>30747</td>\n",
       "      <td>30747</td>\n",
       "      <td>30747</td>\n",
       "      <td>30747</td>\n",
       "      <td>30747</td>\n",
       "      <td>29683</td>\n",
       "      <td>35</td>\n",
       "      <td>35</td>\n",
       "      <td>70327</td>\n",
       "      <td>243</td>\n",
       "      <td>...</td>\n",
       "      <td>70327</td>\n",
       "      <td>70327</td>\n",
       "      <td>70327</td>\n",
       "      <td>70327</td>\n",
       "      <td>36765</td>\n",
       "      <td>335</td>\n",
       "      <td>3542</td>\n",
       "      <td>3542</td>\n",
       "      <td>37425</td>\n",
       "      <td>37425</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>146</td>\n",
       "      <td>256</td>\n",
       "      <td>256</td>\n",
       "      <td>273</td>\n",
       "      <td>17365</td>\n",
       "      <td>325</td>\n",
       "      <td>34</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>34</td>\n",
       "      <td>...</td>\n",
       "      <td>67876</td>\n",
       "      <td>9502</td>\n",
       "      <td>389</td>\n",
       "      <td>40</td>\n",
       "      <td>26023</td>\n",
       "      <td>4</td>\n",
       "      <td>16</td>\n",
       "      <td>6</td>\n",
       "      <td>300</td>\n",
       "      <td>129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>113</td>\n",
       "      <td>25</td>\n",
       "      <td>242</td>\n",
       "      <td>100</td>\n",
       "      <td>175.25.242.100</td>\n",
       "      <td>北京</td>\n",
       "      <td>25783</td>\n",
       "      <td>819</td>\n",
       "      <td>1</td>\n",
       "      <td>186</td>\n",
       "      <td>...</td>\n",
       "      <td>2017-10-13 15:08:12</td>\n",
       "      <td>1324185</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1212321</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1167</td>\n",
       "      <td>91</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>1463</td>\n",
       "      <td>905</td>\n",
       "      <td>817</td>\n",
       "      <td>674</td>\n",
       "      <td>565</td>\n",
       "      <td>5218</td>\n",
       "      <td>2</td>\n",
       "      <td>18</td>\n",
       "      <td>36765</td>\n",
       "      <td>22</td>\n",
       "      <td>...</td>\n",
       "      <td>28</td>\n",
       "      <td>428</td>\n",
       "      <td>62389</td>\n",
       "      <td>62474</td>\n",
       "      <td>183</td>\n",
       "      <td>299</td>\n",
       "      <td>2812</td>\n",
       "      <td>2072</td>\n",
       "      <td>8957</td>\n",
       "      <td>5660</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         ip_1   ip_2   ip_3   ip_4              ip ip_city email_prefix  \\\n",
       "count   30747  30747  30747  30747           30747   29683           35   \n",
       "unique    146    256    256    273           17365     325           34   \n",
       "top       113     25    242    100  175.25.242.100      北京        25783   \n",
       "freq     1463    905    817    674             565    5218            2   \n",
       "\n",
       "       email_provider event_type mobile_prefix_3  ...           time_stamp  \\\n",
       "count              35      70327             243  ...                70327   \n",
       "unique              5          5              34  ...                67876   \n",
       "top               819          1             186  ...  2017-10-13 15:08:12   \n",
       "freq               18      36765              22  ...                   28   \n",
       "\n",
       "       user_name user_agent os_version resource_owner register_type category  \\\n",
       "count      70327      70327      70327          36765           335     3542   \n",
       "unique      9502        389         40          26023             4       16   \n",
       "top      1324185          0          0        1212321             7        0   \n",
       "freq         428      62389      62474            183           299     2812   \n",
       "\n",
       "       status resource_type resource_category  \n",
       "count    3542         37425             37425  \n",
       "unique      6           300               129  \n",
       "top         0          1167                91  \n",
       "freq     2072          8957              5660  \n",
       "\n",
       "[4 rows x 21 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label_event_tmp6.describe(include='O')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 70327 entries, 0 to 70326\n",
      "Data columns (total 22 columns):\n",
      "ip_1                 30747 non-null object\n",
      "ip_2                 30747 non-null object\n",
      "ip_3                 30747 non-null object\n",
      "ip_4                 30747 non-null object\n",
      "ip                   30747 non-null object\n",
      "ip_city              29683 non-null object\n",
      "email_prefix         35 non-null object\n",
      "email_provider       35 non-null object\n",
      "event_type           70327 non-null object\n",
      "mobile_prefix_3      243 non-null object\n",
      "mobile_city          243 non-null object\n",
      "time_stamp           70327 non-null object\n",
      "user_name            70327 non-null object\n",
      "user_agent           70327 non-null object\n",
      "os_version           70327 non-null object\n",
      "resource_owner       36765 non-null object\n",
      "register_type        335 non-null object\n",
      "category             3542 non-null object\n",
      "status               3542 non-null object\n",
      "resource_type        37425 non-null object\n",
      "resource_category    37425 non-null object\n",
      "label                70327 non-null int64\n",
      "dtypes: int64(1), object(21)\n",
      "memory usage: 11.8+ MB\n"
     ]
    }
   ],
   "source": [
    "label_event_tmp6.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Unable to parse string \"unknown, 123\" at position 34089",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32mpandas/_libs/lib.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.maybe_convert_numeric\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Unable to parse string \"unknown, 123\"",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-21-b59b238e4c99>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m for feature in (\"ip_1\",\"ip_2\",\"ip_3\",\"ip_4\",\"email_prefix\",\"email_provider\",\"event_type\",\"mobile_prefix_3\",\"user_name\",\"user_agent\",\"os_version\",\n\u001b[1;32m      2\u001b[0m            'resource_owner','register_type','category','status','resource_type','resource_category'):\n\u001b[0;32m----> 3\u001b[0;31m     \u001b[0mlabel_event_tmp6\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfeature\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel_event_tmp6\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfeature\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdowncast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'signed'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0mlabel_event_tmp6\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'time_stamp'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel_event_tmp6\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'time_stamp'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mlabel_event_tmp6\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/anaconda3/lib/python3.7/site-packages/pandas/core/tools/numeric.py\u001b[0m in \u001b[0;36mto_numeric\u001b[0;34m(arg, errors, downcast)\u001b[0m\n\u001b[1;32m    149\u001b[0m             \u001b[0mcoerce_numeric\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"ignore\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"raise\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    150\u001b[0m             values = lib.maybe_convert_numeric(\n\u001b[0;32m--> 151\u001b[0;31m                 \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcoerce_numeric\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcoerce_numeric\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    152\u001b[0m             )\n\u001b[1;32m    153\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/lib.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.maybe_convert_numeric\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Unable to parse string \"unknown, 123\" at position 34089"
     ]
    }
   ],
   "source": [
    "for feature in (\"ip_1\",\"ip_2\",\"ip_3\",\"ip_4\",\"email_prefix\",\"email_provider\",\"event_type\",\"mobile_prefix_3\",\"user_name\",\"user_agent\",\"os_version\",\n",
    "           'resource_owner','register_type','category','status','resource_type','resource_category'):\n",
    "    label_event_tmp6[feature]=pd.to_numeric(label_event_tmp6[feature],downcast='signed')\n",
    "label_event_tmp6['time_stamp']=pd.to_datetime(label_event_tmp6['time_stamp'])\n",
    "label_event_tmp6.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_event_tmp6.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_events=label_event_tmp6#label_events为所有标签用户事件（带标签）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.几个简单的数据分析图"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.1用户与资源所有者的分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(label_events['user_name'],label_events['resource_owner'])\n",
    "plt.title('user_name and resource_owner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_events[label_events['event_type'] == 2].head(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_events[label_events['event_type'] == 1].head(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pdf.replace(to_replace=r'^\\s*$',value=np.nan,regex=True,inplace=True)   #将空字符替换为nan\n",
    "#pdf_notnull = pdf_notnull[pdf['ip'].notnull()]   #取出ip不为空的行\n",
    "pd.concat([pdf['ip'].str.split('.', expand=True,n = 3),pdf], axis=1) #数据合并"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_events[label_events['event_type'] == 1].tcshrco"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPy import IP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_clean=raw_pd_data.fillna({'ip':\"-1\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in df_clean['ip']:\n",
    "    if i != '-1':\n",
    "        try:\n",
    "            df_clean.loc[df_clean['ip']== i, 'ip'] = IP(i).int()\n",
    "        except:\n",
    "            print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_events.to_csv('/data/jupyter_root/label_events.csv',encoding=\"utf_8_sig\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_date.filter([])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "raw_data.filter(raw_data['ip_city'] == '局域网')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_pd_raw_data.to_csv('/data/jupyter_root/dcube_data/evaluate.txt',sep=',',header=None,index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "338.797px",
    "left": "514.938px",
    "right": "20px",
    "top": "284.406px",
    "width": "529.922px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
